In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log, sqrt
from sklearn import linear_model
In [2]:
# Explicit per-column dtypes handed to pd.read_csv for the house-sales CSV files.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}
In [3]:
# Read the full data set and the test / train / validation splits,
# all with the same explicit column dtypes.
sales, testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        '../datasets/kc_house_data.csv',
        '../datasets/wk3_kc_house_test_data.csv',
        '../datasets/wk3_kc_house_train_data.csv',
        '../datasets/wk3_kc_house_valid_data.csv',
    )
)
In [4]:
def get_numpy_data(data_sframe, features, output):
    """Build the design matrix and target vector for a regression.

    Parameters
    ----------
    data_sframe : pandas.DataFrame
        Source table. It is only read; no column is added to it.
    features : sequence of str
        Names of the columns to use as features.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array)
        ``features_matrix`` is a 2-D float ``numpy.ndarray`` whose first
        column is the constant 1 (intercept) followed by the requested
        feature columns in the given order. ``output_array`` is a 1-D array
        holding the target column.
    """
    # Plain ndarray rather than np.matrix: with np.matrix, `*` is a matrix
    # product and np.dot(matrix, vector) comes back as a (1, N) matrix, which
    # silently breaks element-wise formulas written for 1-D vectors.
    feature_values = np.asarray(data_sframe[list(features)], dtype=float)

    # Prepend the intercept column here instead of writing a 'constant'
    # column into the caller's DataFrame.
    intercept = np.ones((feature_values.shape[0], 1))
    features_matrix = np.hstack([intercept, feature_values])

    output_array = np.array(data_sframe[output])
    return (features_matrix, output_array)
In [5]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions, i.e. ``np.dot(feature_matrix, weights)``."""
    return np.dot(feature_matrix, weights)
In [6]:
def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (2-)norm.

    Parameters
    ----------
    features : 2-D array-like
        One row per observation, one column per feature.

    Returns
    -------
    (normalized_features, norms)
        ``normalized_features`` is ``features`` with each column divided by
        its norm. ``norms`` holds the divisor used for each column: the
        column's 2-norm, or 1.0 for a column that is entirely zero.
    """
    norms = np.linalg.norm(features, axis=0)
    # An all-zero column has norm 0; dividing by it would fill the column
    # with NaN. Use 1 as the divisor instead so the column stays all zeros.
    norms = np.where(norms == 0, 1.0, norms)
    normalized_features = features / norms
    return (normalized_features, norms)
In [7]:
#simple_features = ['sqft_living','bedrooms']
#my_output= 'price'
#(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)
#initial_weights = np.array([1., 4., 1.])
#(normalized_simple_feature_matrix, norms) = normalize_features(simple_feature_matrix)
#print(simple_feature_matrix)
In [8]:
#prediction = predict_outcome(normalized_simple_feature_matrix, initial_weights)
#print (prediction)
#print (output)
#print (output - prediction)
In [9]:
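# Abandoned attempt at computing ro[i] for each feature; left disabled because it did not run.
# Likely cause (TODO confirm): if simple_feature_matrix is an np.matrix, `*` below is a
# matrix product rather than an element-wise product, so the shapes do not line up.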
#i = 0
#ro=[]
#for feature in simple_features:
#print (simple_feature_matrix[:,i])
#ro[i] = SUM[ [feature_i]*(output - prediction + w[i]*[feature_i]) ]
#ro.append(sum(simple_feature_matrix[:,i] * (output - prediction + initial_weights[i] * simple_feature_matrix[:,i])))
#i = i + 1
#print (ro)
In [22]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the updated weight for coordinate ``i`` in one LASSO coordinate-descent step.

    Parameters
    ----------
    i : int
        Index of the weight (column of ``feature_matrix``) to update.
        Index 0 is treated as the intercept and is not regularized.
    feature_matrix : 2-D array-like
        One row per observation, one column per feature. ``np.matrix``
        input is accepted and handled as a plain 2-D array. The update does
        not divide by the column's squared norm, so it presumably expects
        columns already scaled to unit norm -- TODO confirm with callers.
    output : 1-D array-like
        Target values, one per row of ``feature_matrix``.
    weights : 1-D array-like
        Current weights, one per column of ``feature_matrix``.
    l1_penalty : float
        L1 penalty; the soft-threshold is ``l1_penalty / 2``.

    Returns
    -------
    float
        The new value for ``weights[i]``; ``weights`` itself is not modified.
    """
    # Coerce to plain ndarrays. With np.matrix, np.dot(matrix, vector) is a
    # (1, N) matrix and a column slice is (N, 1); adding them broadcasts to an
    # N x N matrix and `*` becomes a matrix product, which yields a wrong
    # ro_i and a huge temporary.
    features = np.asarray(feature_matrix, dtype=float)
    targets = np.asarray(output, dtype=float).ravel()
    coefs = np.asarray(weights, dtype=float).ravel()

    # compute prediction (same product predict_outcome performs)
    prediction = np.dot(features, coefs)

    # ro[i] = SUM[ feature_i * (output - prediction + weight[i] * feature_i) ]
    feature_i = features[:, i]
    ro_i = np.dot(feature_i, targets - prediction + coefs[i] * feature_i)

    half_penalty = l1_penalty / 2.
    if i == 0:  # intercept -- do not regularize
        new_weight_i = ro_i
    elif ro_i < -half_penalty:
        new_weight_i = ro_i + half_penalty
    elif ro_i > half_penalty:
        new_weight_i = ro_i - half_penalty
    else:
        new_weight_i = 0.
    return new_weight_i
In [23]:
# Sanity check of a single coordinate step on a tiny 2x2 example.
# should print 0.425558846691
import math
print(lasso_coordinate_descent_step(
    i=1,
    feature_matrix=np.array([[3. / math.sqrt(13), 1. / math.sqrt(10)],
                             [2. / math.sqrt(13), 3. / math.sqrt(10)]]),
    output=np.array([1., 1.]),
    weights=np.array([1., 4.]),
    l1_penalty=0.1,
))
In [24]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Run cyclical LASSO coordinate descent until the weights stop moving.

    Each sweep updates every weight in index order with
    ``lasso_coordinate_descent_step``. Sweeps repeat until the largest
    absolute change of any single weight within a sweep is below
    ``tolerance``. The largest change is printed after every sweep, and the
    number of sweeps is printed on convergence.

    Parameters
    ----------
    feature_matrix : 2-D array-like
        One row per observation, one column per feature.
    output : 1-D array-like
        Target values.
    initial_weights : 1-D array-like
        Starting weights, one per column. Left unmodified.
    l1_penalty : float
        L1 penalty forwarded to each coordinate step.
    tolerance : float
        Convergence threshold on the per-sweep maximum weight change.

    Returns
    -------
    numpy.ndarray
        The converged weights (a new float array).
    """
    # Work on a float copy: writing into the caller's array would make a
    # second call start from the already-converged weights, and an integer
    # array would silently truncate every update.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    while True:
        max_change = 0
        for i in range(len(weights)):
            old_weight = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            max_change = max(max_change, abs(old_weight - weights[i]))
        print (max_change)
        iterations += 1
        if max_change < tolerance:
            print ('Done in: ', iterations, ' iterations.')
            break
    return weights
In [25]:
# Two-feature model: build the design matrix and scale its columns.
my_output = 'price'
simple_features = ['sqft_living', 'bedrooms']
simple_feature_matrix, output = get_numpy_data(sales, simple_features, my_output)
normalized_simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

# Coordinate-descent settings: start from all-zero weights
# (one for the constant column plus one per feature).
initial_weights = np.zeros(3)
L1_penalty = 1e7
tolerance = 1.0
In [26]:
# Show the target vector, then the normalized design matrix.
for shown in (output, normalized_simple_feature_matrix):
    print(shown)
In [27]:
# Fit the two-feature LASSO model and show the learned weights.
opt_weights = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=L1_penalty,
    tolerance=tolerance,
)
print(opt_weights)
In [ ]:
# Residual sum of squares of the fitted model on the normalized features.
residuals = np.array(np.dot(normalized_simple_feature_matrix, opt_weights) - output)
RSS = np.square(residuals).sum()
print(RSS)
In [ ]:
In [ ]: